BAP treated hESC (day 8) snRNAseq data analyses (v2)
snRNAseq data analyses
Prerequisites
This experiment has 2 conditions (BAP treated hESC exposed to 20% oxygen and 5% oxygen), with 2 replicates each. The details are provided in the manuscript. The packages that are needed for the analyses were loaded as below. If you need the version information, session information is printed at the bottom of this wiki.
setwd("/work/LAS/geetu-lab/arnstrm/timecourse/5_cr-count.o2stress/version_1")
# load the modules
library(Seurat)
#> Attaching SeuratObject
library(knitr)
library(kableExtra)
library(ggplot2)
library(cowplot)
library(patchwork)
#>
#> Attaching package: 'patchwork'
#> The following object is masked from 'package:cowplot':
#>
#> align_plots
library(metap)
library(multtest)
#> Loading required package: BiocGenerics
#>
#> Attaching package: 'BiocGenerics'
#> The following objects are masked from 'package:stats':
#>
#> IQR, mad, sd, var, xtabs
#> The following objects are masked from 'package:base':
#>
#> anyDuplicated, append, as.data.frame, basename, cbind, colnames,
#> dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
#> grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
#> order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
#> rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
#> union, unique, unsplit, which.max, which.min
#> Loading required package: Biobase
#> Welcome to Bioconductor
#>
#> Vignettes contain introductory material; view with
#> 'browseVignettes()'. To cite Bioconductor, see
#> 'citation("Biobase")', and for packages 'citation("pkgname")'.
library(gridExtra)
#>
#> Attaching package: 'gridExtra'
#> The following object is masked from 'package:Biobase':
#>
#> combine
#> The following object is masked from 'package:BiocGenerics':
#>
#> combine
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following object is masked from 'package:gridExtra':
#>
#> combine
#> The following object is masked from 'package:Biobase':
#>
#> combine
#> The following objects are masked from 'package:BiocGenerics':
#>
#> combine, intersect, setdiff, union
#> The following object is masked from 'package:kableExtra':
#>
#> group_rows
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(stringr)
library(TissueEnrich)
#> Loading required package: ensurer
#> Loading required package: SummarizedExperiment
#> Loading required package: MatrixGenerics
#> Loading required package: matrixStats
#>
#> Attaching package: 'matrixStats'
#> The following object is masked from 'package:dplyr':
#>
#> count
#> The following objects are masked from 'package:Biobase':
#>
#> anyMissing, rowMedians
#>
#> Attaching package: 'MatrixGenerics'
#> The following objects are masked from 'package:matrixStats':
#>
#> colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
#> colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
#> colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
#> colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
#> colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
#> colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
#> colWeightedMeans, colWeightedMedians, colWeightedSds,
#> colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
#> rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
#> rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
#> rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
#> rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
#> rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
#> rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
#> rowWeightedSds, rowWeightedVars
#> The following object is masked from 'package:Biobase':
#>
#> rowMedians
#> Loading required package: GenomicRanges
#> Loading required package: stats4
#> Loading required package: S4Vectors
#>
#> Attaching package: 'S4Vectors'
#> The following objects are masked from 'package:dplyr':
#>
#> first, rename
#> The following objects are masked from 'package:base':
#>
#> expand.grid, I, unname
#> Loading required package: IRanges
#>
#> Attaching package: 'IRanges'
#> The following objects are masked from 'package:dplyr':
#>
#> collapse, desc, slice
#> Loading required package: GenomeInfoDb
#>
#> Attaching package: 'SummarizedExperiment'
#> The following object is masked from 'package:SeuratObject':
#>
#> Assays
#> The following object is masked from 'package:Seurat':
#>
#> Assays
#> Loading required package: GSEABase
#> Loading required package: annotate
#> Loading required package: AnnotationDbi
#>
#> Attaching package: 'AnnotationDbi'
#> The following object is masked from 'package:dplyr':
#>
#> select
#> Loading required package: XML
#> Loading required package: graph
#>
#> Attaching package: 'graph'
#> The following object is masked from 'package:XML':
#>
#> addNode
#> The following object is masked from 'package:stringr':
#>
#> boundary
library(gprofiler2)
library(tidyverse)
#> ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
#> ✓ tibble 3.1.6 ✓ purrr 0.3.4
#> ✓ tidyr 1.2.0 ✓ forcats 0.5.1
#> ✓ readr 2.1.2
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> x graph::boundary() masks stringr::boundary()
#> x IRanges::collapse() masks dplyr::collapse()
#> x dplyr::combine() masks gridExtra::combine(), Biobase::combine(), BiocGenerics::combine()
#> x matrixStats::count() masks dplyr::count()
#> x IRanges::desc() masks dplyr::desc()
#> x tidyr::expand() masks S4Vectors::expand()
#> x dplyr::filter() masks stats::filter()
#> x S4Vectors::first() masks dplyr::first()
#> x dplyr::group_rows() masks kableExtra::group_rows()
#> x dplyr::lag() masks stats::lag()
#> x BiocGenerics::Position() masks ggplot2::Position(), base::Position()
#> x purrr::reduce() masks GenomicRanges::reduce(), IRanges::reduce()
#> x S4Vectors::rename() masks dplyr::rename()
#> x AnnotationDbi::select() masks dplyr::select()
#> x IRanges::slice() masks dplyr::slice()
library(enhancedDimPlot)
library(calibrate)
#> Loading required package: MASS
#>
#> Attaching package: 'MASS'
#> The following object is masked from 'package:AnnotationDbi':
#>
#> select
#> The following object is masked from 'package:dplyr':
#>
#> select
#> The following object is masked from 'package:patchwork':
#>
#> area
library(ggrepel)
library(dittoSeq)
library(ComplexHeatmap)
#> Loading required package: grid
#> ========================================
#> ComplexHeatmap version 2.10.0
#> Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
#> Github page: https://github.com/jokergoo/ComplexHeatmap
#> Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
#>
#> If you use it in published research, please cite:
#> Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional
#> genomic data. Bioinformatics 2016.
#>
#> The new InteractiveComplexHeatmap package can directly export static
#> complex heatmaps into an interactive Shiny app with zero effort. Have a try!
#>
#> This message can be suppressed by:
#> suppressPackageStartupMessages(library(ComplexHeatmap))
#> ========================================
library(scales)
#>
#> Attaching package: 'scales'
#> The following object is masked from 'package:purrr':
#>
#> discard
#> The following object is masked from 'package:readr':
#>
#> col_factor
library(ggvenn)
library(plotly)
#>
#> Attaching package: 'plotly'
#> The following object is masked from 'package:ComplexHeatmap':
#>
#> add_heatmap
#> The following object is masked from 'package:MASS':
#>
#> select
#> The following object is masked from 'package:AnnotationDbi':
#>
#> select
#> The following object is masked from 'package:IRanges':
#>
#> slice
#> The following object is masked from 'package:S4Vectors':
#>
#> rename
#> The following object is masked from 'package:ggplot2':
#>
#> last_plot
#> The following object is masked from 'package:stats':
#>
#> filter
#> The following object is masked from 'package:graphics':
#>
#> layout
library(DT)
#>
#> Attaching package: 'DT'
#> The following object is masked from 'package:SeuratObject':
#>
#> JS
#> The following object is masked from 'package:Seurat':
#>
#> JS
library(ape)
#>
#> Attaching package: 'ape'
#> The following objects are masked from 'package:graph':
#>
#> complement, degree, edges
library(enrichR)
#> Welcome to enrichR
#> Checking connection ...
#> Enrichr ... Connection is Live!
#> FlyEnrichr ... Connection is available!
#> WormEnrichr ... Connection is available!
#> YeastEnrichr ... Connection is available!
#> FishEnrichr ... Connection is available!
library(SeuratWrappers)Importing 10x Datasets
The 10X data was already processed with CellRanger (v.4.0.0) and the counts table was ready for us to import for the data analyses. We used the inbuilt function to import this data and to create a Seurat object as described below.
experiment_name = "BAP"
dataset_loc <- "/work/LAS/geetu-lab/arnstrm/timecourse/1_data/input_v6.1.2/with_introns"
ids <- c("5pcO2_r1", "5pcO2_r2", "20pcO2_r1", "20pcO2_r2")
# function d10x.data
d10x.data <- sapply(ids, function(i){
d10x <- Read10X(file.path(dataset_loc,i,"filtered_feature_bc_matrix"))
colnames(d10x) <- paste(sapply(strsplit(colnames(d10x),split="-"), '[[' , 1L ), i, sep="-")
d10x
})
experiment.data <- do.call("cbind", d10x.data)
bapd8.combined <- CreateSeuratObject(
experiment.data,
project = "BAPd8",
min.cells = 10,
min.genes = 200,
names.field = 2,
names.delim = "\\-")
# backup the object
bapd8.temp <- bapd8.combinedData quality insepction
After the data was imported, we checked the quality of the data. Mitochondrial expression is an important criteria (along with other quantitative features of each nuclei) to decide if the nucleus is good or bad. We tested it as follows
MT ratio in nucleus
bapd8.combined$log10GenesPerUMI <- log10(bapd8.combined$nFeature_RNA) / log10(bapd8.combined$nCount_RNA)
bapd8.combined$mitoRatio <- PercentageFeatureSet(object = bapd8.combined, pattern = "^MT-")
bapd8.combined$mitoRatio <- bapd8.combined@meta.data$mitoRatio / 100
metadata <- bapd8.combined@meta.data
metadata$cells <- rownames(metadata)
metadata <- metadata %>%
dplyr::rename(seq_folder = orig.ident,
nUMI = nCount_RNA,
nGene = nFeature_RNA,
seq_folder = orig.ident)
p <- ggplot(dat = metadata, aes(x=nUMI, y=nGene, color=mitoRatio)) +
geom_point(alpha = 0.5) +
scale_colour_gradient(low = "gray90", high = "black") + labs(colour="MT ratio") +
theme_bw(base_size = 12) +
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
strip.background = element_blank(),
panel.border = element_rect(colour = "black")) +
xlab("RNA counts") + ylab("Gene counts") +
stat_smooth(method=lm) +
facet_wrap(~seq_folder, labeller = labeller(seq_folder =
c("20pcO2_r1" = "20% Oxygen (rep1)",
"20pcO2_r2" = "20% Oxygen (rep2)",
"5pcO2_r1" = "5% Oxygen (rep1)",
"5pcO2_r2" = "5% Oxygen (rep2)"))) +
scale_y_continuous(label=comma) +
scale_x_continuous(label=comma)
ggplotly(p)
#> `geom_smooth()` using formula 'y ~ x'Relationship between the total molecules detected (transcripts) vs. total genes detected across samples. Each nucleus is represented as a dot, with the color intensity representing the mitochondrial read ratio in that nucleus.
Number of nuclei per sample
ggplot(metadata, aes(x=seq_folder, fill=seq_folder)) +
geom_bar() +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
theme(plot.title = element_text(hjust=0.5, face="bold")) +
ggtitle("Number of Nuclei")Number of nuclei found in each sample
Density of nuclei per sample
ggplot(metadata, aes(color=seq_folder, x=nUMI, fill= seq_folder)) +
geom_density(alpha = 0.2) +
scale_x_log10() +
theme_classic() +
ylab("Cell density") +
geom_vline(xintercept = 500)Density of nuclei vs. UMIs
Number of Nuclei vs. genes
ggplot(metadata, aes(x=seq_folder, y=log10(nGene), fill=seq_folder)) +
geom_boxplot() +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
theme(plot.title = element_text(hjust=0.5, face="bold")) +
ggtitle("NNuclei vs NGenes")Nubmer of nuclei vs. genes
Number of Nuclei vs. genes
ggplot(metadata, aes(x=log10GenesPerUMI, color = seq_folder, fill=seq_folder)) +
geom_density(alpha = 0.2) +
theme_classic() +
geom_vline(xintercept = 0.8)Nubmer of Nuclei vs. genes
Mitochondrial density across samples
ggplot(metadata, aes(color=seq_folder, x=mitoRatio, fill=seq_folder)) +
geom_density(alpha = 0.2) +
scale_x_log10() +
theme_classic() +
geom_vline(xintercept = 0.2)
#> Warning: Transformation introduced infinite values in continuous x-axis
#> Warning: Removed 17 rows containing non-finite values (stat_density).Mitochondrial density across samples
Data filtering
After inspection, we decided to remove all mitochondiral genes as well as ribosomal genes from our analyses.
set up the metadata file and organize
bapd8.combined <- bapd8.temp
df <- bapd8.combined@meta.data
df$replicate <- NA
df$replicate[which(str_detect(df$orig.ident, "5pcO2"))] <- "5pcO2"
df$replicate[which(str_detect(df$orig.ident, "20pcO2"))] <- "20pcO2"
bapd8.combined@meta.data <- df
bapd8.combined[["percent.mt"]] <- PercentageFeatureSet(bapd8.combined, pattern = "^MT-")
datatable(bapd8.combined@meta.data, rownames = TRUE, filter="top", options = list(pageLength = 15, scrollX=T) )set up the metadata file and organize
v1 <- VlnPlot(bapd8.combined, features = "nFeature_RNA", pt.size = 1) +
geom_hline(yintercept=200, color = "red", size=1) +
geom_hline(yintercept=7500, color = "red", size=1) +
theme(legend.position = "none")
v2 <- VlnPlot(bapd8.combined, features = "nCount_RNA", pt.size = 1) +
theme(legend.position = "none")
v3 <- VlnPlot(bapd8.combined, features = "percent.mt", pt.size = 1) +
geom_hline(yintercept=15, color = "red", size=1) +
theme(legend.position = "none")
v1 | v2 | v3Before filtering
B1 <- FeatureScatter(bapd8.combined, feature1 = "nCount_RNA", feature2 = "percent.mt")
B2 <- FeatureScatter(bapd8.combined, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
B1 | B2Preliminary filtering
bapd8.combined <- subset(bapd8.combined, subset = nFeature_RNA > 200 & nFeature_RNA < 7500 & percent.mt < 25)
I1 <- FeatureScatter(bapd8.combined, feature1 = "nCount_RNA", feature2 = "percent.mt")
I2 <- FeatureScatter(bapd8.combined, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
I1 | I2Final filtering
bapd8.combined <- subset(bapd8.combined, subset = nFeature_RNA > 200 & nFeature_RNA < 7500 & percent.mt < 15)
A1 <- FeatureScatter(bapd8.combined, feature1 = "nCount_RNA", feature2 = "percent.mt")
A2 <- FeatureScatter(bapd8.combined, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
A1 | A2Removing ribosomal and MT proteins
counts <- GetAssayData(object = bapd8.combined, slot = "counts")
counts <- counts[grep(pattern = "^MT", x = rownames(counts), invert = TRUE),]
counts <- counts[grep(pattern = "^MT", x = rownames(counts), invert = TRUE),]
counts <- counts[grep(pattern = "^RPL", x = rownames(counts), invert = TRUE),]
counts <- counts[grep(pattern = "^RPS", x = rownames(counts), invert = TRUE),]
counts <- counts[grep(pattern = "^MRPS", x = rownames(counts), invert = TRUE),]
counts <- counts[grep(pattern = "^MRPL", x = rownames(counts), invert = TRUE),]
keep_genes <- Matrix::rowSums(counts) >= 10
filtered_counts <- counts[keep_genes, ]
bapd8.fcombined <- CreateSeuratObject(filtered_counts, meta.data = bapd8.combined@meta.data)
bapd8.fcombined@meta.data <- bapd8.fcombined@meta.data[1:4]
bapd8.combined <- bapd8.fcombinedData integration and Clustering
Seurat package was used for integrating samples and running the snRNAseq analyses.
data integration
set.seed(1111)
bapd8.list <- SplitObject(bapd8.combined, split.by = "orig.ident")
bapd8.list <- lapply(X = bapd8.list, FUN = function(x) {
x <- NormalizeData(x)
x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 2000)
})Seurat
bapd8.anchors <- FindIntegrationAnchors(object.list = bapd8.list, dims = 1:20)
#> Computing 2000 integration features
#> Scaling features for provided objects
#> Finding all pairwise anchors
#> Running CCA
#> Merging objects
#> Finding neighborhoods
#> Finding anchors
#> Found 2945 anchors
#> Filtering anchors
#> Retained 2531 anchors
#> Running CCA
#> Merging objects
#> Finding neighborhoods
#> Finding anchors
#> Found 4571 anchors
#> Filtering anchors
#> Retained 2721 anchors
#> Running CCA
#> Merging objects
#> Finding neighborhoods
#> Finding anchors
#> Found 3619 anchors
#> Filtering anchors
#> Retained 2070 anchors
#> Running CCA
#> Merging objects
#> Finding neighborhoods
#> Finding anchors
#> Found 3814 anchors
#> Filtering anchors
#> Retained 2652 anchors
#> Running CCA
#> Merging objects
#> Finding neighborhoods
#> Finding anchors
#> Found 3227 anchors
#> Filtering anchors
#> Retained 2259 anchors
#> Running CCA
#> Merging objects
#> Finding neighborhoods
#> Finding anchors
#> Found 5013 anchors
#> Filtering anchors
#> Retained 3810 anchors
bapd8.integrated <- IntegrateData(anchorset = bapd8.anchors, dims = 1:20)
#> Merging dataset 2 into 1
#> Extracting anchors for merged samples
#> Finding integration vectors
#> Finding integration vector weights
#> Integrating data
#> Merging dataset 4 into 3
#> Extracting anchors for merged samples
#> Finding integration vectors
#> Finding integration vector weights
#> Integrating data
#> Merging dataset 1 2 into 3 4
#> Extracting anchors for merged samples
#> Finding integration vectors
#> Finding integration vector weights
#> Integrating data
DefaultAssay(bapd8.integrated) <- "integrated"
bapd8.integrated <- ScaleData(bapd8.integrated, verbose = FALSE)
bapd8.integrated <- RunPCA(bapd8.integrated, npcs = 30, verbose = FALSE)
bapd8.integrated <- RunUMAP(bapd8.integrated, reduction = "pca", dims = 1:20)
#> Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
#> To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
#> This message will be shown once per session
#> 17:25:05 UMAP embedding parameters a = 0.9922 b = 1.112
#> 17:25:05 Read 6428 rows and found 20 numeric columns
#> 17:25:05 Using Annoy for neighbor search, n_neighbors = 30
#> 17:25:05 Building Annoy index with metric = cosine, n_trees = 50
#> 0% 10 20 30 40 50 60 70 80 90 100%
#> [----|----|----|----|----|----|----|----|----|----|
#> **************************************************|
#> 17:25:06 Writing NN index file to temp file /tmp/Rtmptux5Nm/file2721276d1959b
#> 17:25:06 Searching Annoy index using 1 thread, search_k = 3000
#> 17:25:08 Annoy recall = 100%
#> 17:25:08 Commencing smooth kNN distance calibration using 1 thread
#> 17:25:10 Initializing from normalized Laplacian + noise
#> 17:25:10 Commencing optimization for 500 epochs, with 279604 positive edges
#> 17:25:17 Optimization finished
bapd8.integrated <- FindNeighbors(bapd8.integrated, reduction = "pca", dims = 1:20)
#> Computing nearest neighbor graph
#> Computing SNN
bapd8.integrated <- FindClusters(bapd8.integrated, resolution = 0.5)
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#>
#> Number of nodes: 6428
#> Number of edges: 254094
#>
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.8858
#> Number of communities: 11
#> Elapsed time: 0 secondsRenumber the clusters
By default Seurat assigns the cluster identity starting from zero. Since we prefer identity starting from one, we renumbered cluster 0-10 to 1-11.
num.clusters <- nlevels(bapd8.integrated$seurat_clusters)
df <- bapd8.integrated@meta.data
df$new_clusters <- as.factor(as.numeric(df$seurat_clusters))
bapd8.integrated@meta.data <- df
Idents(bapd8.integrated) <- "new_clusters"dimplots (colored based on clusters)
The Dimensional reduction plot was plotted using the Seurat DipPlot function, with colors representing different groups/clusters.
d1 <- enhancedDimPlot(object = bapd8.integrated, grouping_var = 'ident', reduction = "umap", label = TRUE, pt.size = 1, alpha = 0.5) +
ggtitle("A") + xlab("UMAP_1") + ylab("UMAP_2") +
theme_classic() +
theme(legend.position = "none", plot.title = element_text(face = "bold"))
ggplotly(d1)
#> Warning in geom2trace.default(dots[[1L]][[11L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomLabelRepel() has yet to be implemented in plotly.
#> If you'd like to see this geom implemented,
#> Please open an issue with your example code at
#> https://github.com/ropensci/plotly/issues
#> Warning in geom2trace.default(dots[[1L]][[11L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomLabelRepel() has yet to be implemented in plotly.
#> If you'd like to see this geom implemented,
#> Please open an issue with your example code at
#> https://github.com/ropensci/plotly/issues
#> Warning in geom2trace.default(dots[[1L]][[11L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomLabelRepel() has yet to be implemented in plotly.
#> If you'd like to see this geom implemented,
#> Please open an issue with your example code at
#> https://github.com/ropensci/plotly/issues
#> Warning in geom2trace.default(dots[[1L]][[11L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomLabelRepel() has yet to be implemented in plotly.
#> If you'd like to see this geom implemented,
#> Please open an issue with your example code at
#> https://github.com/ropensci/plotly/issues
#> Warning in geom2trace.default(dots[[1L]][[11L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomLabelRepel() has yet to be implemented in plotly.
#> If you'd like to see this geom implemented,
#> Please open an issue with your example code at
#> https://github.com/ropensci/plotly/issues
#> Warning in geom2trace.default(dots[[1L]][[11L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomLabelRepel() has yet to be implemented in plotly.
#> If you'd like to see this geom implemented,
#> Please open an issue with your example code at
#> https://github.com/ropensci/plotly/issues
#> Warning in geom2trace.default(dots[[1L]][[11L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomLabelRepel() has yet to be implemented in plotly.
#> If you'd like to see this geom implemented,
#> Please open an issue with your example code at
#> https://github.com/ropensci/plotly/issues
#> Warning in geom2trace.default(dots[[1L]][[11L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomLabelRepel() has yet to be implemented in plotly.
#> If you'd like to see this geom implemented,
#> Please open an issue with your example code at
#> https://github.com/ropensci/plotly/issues
#> Warning in geom2trace.default(dots[[1L]][[11L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomLabelRepel() has yet to be implemented in plotly.
#> If you'd like to see this geom implemented,
#> Please open an issue with your example code at
#> https://github.com/ropensci/plotly/issues
#> Warning in geom2trace.default(dots[[1L]][[11L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomLabelRepel() has yet to be implemented in plotly.
#> If you'd like to see this geom implemented,
#> Please open an issue with your example code at
#> https://github.com/ropensci/plotly/issues
#> Warning in geom2trace.default(dots[[1L]][[11L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomLabelRepel() has yet to be implemented in plotly.
#> If you'd like to see this geom implemented,
#> Please open an issue with your example code at
#> https://github.com/ropensci/plotly/issuesFig2A: Dimensional reduction plot showing 5,355 nuclei plotted in two dimensions. The colored dots represent individual nuclei and are assigned based on cluster identity
dimplots (colored based on conditions)
d2 <- enhancedDimPlot(object = bapd8.integrated, grouping_var = 'replicate', reduction = "umap", label = FALSE, pt.size = 1, alpha = 0.4) +
ggtitle("B") +
xlab("UMAP_1") +
ylab("UMAP_2") +
theme_classic() +
theme(legend.justification = c(1, 1), legend.position = c(1, 1), plot.title = element_text(face = "bold")) +
scale_colour_manual(name = "Conditions",
labels = c(expression(paste('20% ', 'O'[2])), expression(paste('5% ', 'O'[2]))),
values = c("20pcO2" = "#0571b0", "5pcO2" = "#ca0020")) +
scale_fill_manual(name = "Conditions",
labels = c(expression(paste('20% ', 'O'[2])), expression(paste('5% ', 'O'[2]))),
values = c("20pcO2" = "#0571b0", "5pcO2" = "#ca0020")) +
scale_linetype_manual(values = "blank")
ggplotly(d2)Fig2B: Dimensional reduction plot showing 5,355 nuclei plotted in two dimensions. The colored dots represent individual nuclei and are assigned based on treatment.
dimplots (colored based on samples)
d3 <- enhancedDimPlot(object = bapd8.integrated, grouping_var = 'orig.ident', reduction = "umap", label = FALSE, pt.size = 1, alpha = 0.4) +
ggtitle("C") +
xlab("UMAP_1") +
ylab("UMAP_2") +
theme_classic() +
theme(legend.justification = c(1, 1), legend.position = c(1, 1), plot.title = element_text(face = "bold")) +
scale_colour_manual(name = "Replicates",
labels = c(expression(paste('20% ', 'O'[2], ' rep1')), expression(paste('20% ', 'O'[2], ' rep2')), expression(paste('5% ', 'O'[2], ' rep1')), expression(paste('5% ', 'O'[2], ' rep1'))),
values = c("20pcO2_r1" = "#0571b0", "20pcO2_r2" = "#92c5de", "5pcO2_r1" = "#ca0020", "5pcO2_r2" = "#f4a582")) +
scale_fill_manual(name = "Replicates",
labels = c(expression(paste('20% ', 'O'[2], ' rep1')), expression(paste('20% ', 'O'[2], ' rep2')), expression(paste('5% ', 'O'[2], ' rep1')), expression(paste('5% ', 'O'[2], ' rep1'))),
values = c("20pcO2_r1" = "#0571b0", "20pcO2_r2" = "#92c5de", "5pcO2_r1" = "#ca0020", "5pcO2_r2" = "#f4a582")) +
scale_linetype_manual(values = "blank")
ggplotly(d3)Fig2C: Dimensional reduction plot showing 5,355 nuclei plotted in two dimensions. The colored dots represent individual nuclei and are assigned based on replicate
Find markers
Find markers for each cluster. The Seurat command FindMarkers was run using the cluster identity assigned in the previous step. Additional column with the Fold (converted from natural log fold change of seurat output) was added to the table. The filtering was done for genes with avg_FC >= 1.5 and p_val_adj <= 0.05.
DefaultAssay(bapd8.integrated) <- "RNA"
lhs.a <- paste("markers.all.", 1:num.clusters, sep="")
rhs.a <- paste("FindMarkers(bapd8.integrated, ident.1 = ",1:num.clusters," )", sep="")
commands.a <- paste(paste(lhs.a, rhs.a, sep="<-"), collapse=";")
eval(parse(text=commands.a))
#markers.all.1$avg_FC <- 2^markers.all.1$avg_log2FC
lhs.b <- paste("markers.all.", 1:num.clusters, "$avg_FC", sep="")
rhs.b <- paste("2^markers.all.",1:num.clusters,"$avg_log2FC", sep="")
commands.b <- paste(paste(lhs.b, rhs.b, sep="<-"), collapse=";")
eval(parse(text=commands.b))
lhs.c <- paste("markers.filtered.", 1:num.clusters, sep="")
rhs.c <- paste("markers.all.",1:num.clusters," %>% filter(avg_FC >= 1.5) %>% filter(p_val_adj <= 0.05) %>% arrange(desc(avg_FC))", sep="")
commands.c <- paste(paste(lhs.c, rhs.c, sep="<-"), collapse=";")
eval(parse(text=commands.c))
lhs.d <- paste("markers.filtered.names.", 1:num.clusters, sep="")
rhs.d <- paste("rownames(markers.filtered.",1:num.clusters,")", sep="")
commands.d <- paste(paste(lhs.d, rhs.d, sep="<-"), collapse=";")
eval(parse(text=commands.d))Check the number of markers
message (paste("Cluster 1 as", length(markers.filtered.names.1), "markers", sep = " "))
#> Cluster 1 as 627 markers
message (paste("Cluster 2 as", length(markers.filtered.names.2), "markers", sep = " "))
#> Cluster 2 as 449 markers
message (paste("Cluster 3 as", length(markers.filtered.names.3), "markers", sep = " "))
#> Cluster 3 as 319 markers
message (paste("Cluster 4 as", length(markers.filtered.names.4), "markers", sep = " "))
#> Cluster 4 as 682 markers
message (paste("Cluster 5 as", length(markers.filtered.names.5), "markers", sep = " "))
#> Cluster 5 as 405 markers
message (paste("Cluster 6 as", length(markers.filtered.names.6), "markers", sep = " "))
#> Cluster 6 as 34 markers
message (paste("Cluster 7 as", length(markers.filtered.names.7), "markers", sep = " "))
#> Cluster 7 as 371 markers
message (paste("Cluster 8 as", length(markers.filtered.names.8), "markers", sep = " "))
#> Cluster 8 as 399 markers
message (paste("Cluster 9 as", length(markers.filtered.names.9), "markers", sep = " "))
#> Cluster 9 as 426 markers
message (paste("Cluster 10 as", length(markers.filtered.names.10), "markers", sep = " "))
#> Cluster 10 as 457 markers
message (paste("Cluster 11 as", length(markers.filtered.names.11), "markers", sep = " "))
#> Cluster 11 as 656 markersMarker plots
Combined expression of all the markers genes in various clusters. The markers have higher expression in their respecitve cluster than compared to the rest of the clusters.
ggplotColours <- function(n = 6, h = c(0, 360) + 15){
if ((diff(h) %% 360) < 1) h[2] <- h[2] - 360/n
hcl(h = (seq(h[1], h[2], length = n)), c = 100, l = 65)
}
color_list <- ggplotColours(n=11)
grouped_violinPlots <- function(markersfile, clusternumber, seuratobject = bapd8.integrated) {
dittoPlotVarsAcrossGroups(seuratobject, markersfile,
group.by = "new_clusters", main = paste("Cluster ", clusternumber, " markers"),
xlab = "Clusters",
ylab = "Mean z-score expression",
x.labels = c("Cluster 1", "Cluster 2", "Cluster 3", "Cluster 4",
"Cluster 5", "Cluster 6", "Cluster 7", "Cluster 8",
"Cluster 9", "Cluster 10", "Cluster 11"),
vlnplot.lineweight = 0.5,
legend.show = FALSE,
jitter.size = 0.5,
color.panel = color_list)
}Markers of cluster 1
grouped_violinPlots(markers.filtered.names.1, 1)Markers of cluster 2
grouped_violinPlots(markers.filtered.names.2, 2)Markers of cluster 3
grouped_violinPlots(markers.filtered.names.3, 3)Markers of cluster 4
grouped_violinPlots(markers.filtered.names.4, 4)Markers of cluster 5
grouped_violinPlots(markers.filtered.names.5, 5)Markers of cluster 6
grouped_violinPlots(markers.filtered.names.6, 6)Markers of cluster 7
grouped_violinPlots(markers.filtered.names.7, 7)Markers of cluster 8
grouped_violinPlots(markers.filtered.names.8, 8)Markers of cluster 9
grouped_violinPlots(markers.filtered.names.9, 9)Markers of cluster 10
grouped_violinPlots(markers.filtered.names.10, 10)Markers of cluster 11
grouped_violinPlots(markers.filtered.names.11, 11)Run PlacentaCellEnrich on markers
PlacentaCellEnrich was run command-line using the TissueEnrich R package. We used this to assign cell identity to cluster 2, 3, 5 and 6. The function used for running PCE is as follows:
# Vento-Tormo et al., dataset
input="/work/LAS/geetu-lab/arnstrm/timecourse/1_data/input_v6.1.2/other.info/"
l <-
load(file = paste0(input, "combine-test-expression1.Rdata"))
humanGeneMapping <- dataset$GRCH38$humanGeneMapping
d <- dataset$PlacentaDeciduaBloodData
data <- d$expressionData
cellDetails <- d$cellDetails
# Xiang et al., dataset
te.dataset.xiang <- readRDS(paste0(input, "te.dataset.xiang.rds"))
# Castel et al., dataset
te.dataset.castel <- readRDS(paste0(input, "te.dataset.castel.rds"))
# full names for cell types
xi.md <-
read.csv(
paste0(input, "/md-xi.tsv"),
sep = "\t",
header = TRUE,
row.names = 1
)
vt.md <-
read.csv(
paste0(input, "md-vt.tsv"),
sep = "\t",
header = TRUE,
row.names = 1
)
zp.md <-
read.csv(
paste0(input, "md-zp.tsv"),
sep = "\t",
header = TRUE,
row.names = 1
)
runpce <- function(inputgenelist, barcolor) {
inputGenes <- toupper(inputgenelist)
gs1 <- GeneSet(geneIds = toupper(inputgenelist))
humanGene <-
humanGeneMapping[humanGeneMapping$Gene.name %in% inputGenes, ]
inputGenes <- humanGene$Gene
expressionData <-
data[intersect(row.names(data), humanGeneMapping$Gene), ]
se <-
SummarizedExperiment(
assays = SimpleList(as.matrix(expressionData)),
rowData = row.names(expressionData),
colData = colnames(expressionData)
)
cellSpecificGenesExp <-
teGeneRetrieval(se, expressedGeneThreshold = 1)
gs <- GeneSet(geneIds = toupper(inputGenes))
output.vt <- teEnrichmentCustom(gs, cellSpecificGenesExp)
en.output.vt <-
setNames(data.frame(assay(output.vt[[1]]), row.names = rowData(output.vt[[1]])[, 1]),
colData(output.vt[[1]])[, 1])
row.names(cellDetails) <- cellDetails$RName
en.output.vt$Tissue <-
cellDetails[row.names(en.output.vt), "CellName"]
en.output.vt$source <- "VT"
en.output.vt <- en.output.vt[order(-en.output.vt$Log10PValue),]
en.output.vt <-
merge(en.output.vt, vt.md, by = "row.names", all.x = TRUE)
en.output.vt <- rownames_to_column(en.output.vt, var = "Name")
output.xi <- teEnrichmentCustom(gs1, te.dataset.xiang)
en.output.xi <-
setNames(data.frame(assay(output.xi[[1]]), row.names = rowData(output.xi[[1]])[, 1]),
colData(output.xi[[1]])[, 1])
en.output.xi$Tissue <- rownames(en.output.xi)
en.output.xi$source <- "Xi"
en.output.xi <- en.output.xi[order(-en.output.xi$Log10PValue),]
en.output.xi <-
merge(en.output.xi, xi.md, by = "row.names", all.x = TRUE)
en.output.xi <- rownames_to_column(en.output.xi, var = "Name")
output.zp <- teEnrichmentCustom(gs1, te.dataset.castel)
en.output.zp <-
setNames(data.frame(assay(output.zp[[1]]), row.names = rowData(output.zp[[1]])[, 1]),
colData(output.zp[[1]])[, 1])
en.output.zp$Tissue <- rownames(en.output.zp)
en.output.zp$source <- "ZP"
en.output.zp <- en.output.zp[order(-en.output.zp$Log10PValue),]
en.output.zp <-
merge(en.output.zp, zp.md, by = "row.names", all.x = TRUE)
en.output.zp <- rownames_to_column(en.output.zp, var = "Name")
en.conbined <- rbind(en.output.vt, en.output.xi, en.output.zp)
# p <- 0.05
# logp <- -log10(p)
en.conbined <- en.conbined %>%
# mutate(Log10PValue = replace(Log10PValue, Log10PValue < logp, 0))
# en.conbined %>%
group_by(source) %>%
arrange(source, desc(Log10PValue)) %>% dplyr::slice(1:7) %>%
ungroup %>%
mutate(
source = as.factor(source),
CellNames = tidytext::reorder_within(CellNames, Log10PValue, source, sep = ":")
)
ggplot(en.conbined, aes(CellNames, Log10PValue)) + geom_bar(stat = 'identity', fill = barcolor) + theme_minimal() +
theme(
axis.text.x = element_text(
vjust = 1,
hjust = 1,
size = 10
),
axis.text.y = element_text(size = 8),
plot.margin = margin(10, 10, 10, 100),
legend.position = "none",
plot.title = element_text(
color = "black",
size = 10,
face = "bold.italic"
),
axis.title.y = element_blank(),
axis.line.x = element_line(
colour = 'black',
size = 0.5,
linetype = 'solid'
),
axis.ticks.x = element_line(
colour = 'black',
size = 1,
linetype = 'solid'
),
axis.title.x = element_text(
color = "black",
size = 10,
face = "bold"
)
) +
scale_y_continuous(expand = expansion(mult = c(0, .1)), breaks = pretty_breaks()) + ylab("-log10 p-value") +
facet_wrap( ~ source, scales = "free", ncol = 3) +
coord_flip()
}PCE for cluster 1 markers
runpce(markers.filtered.names.1, color_list[1])
#> No background list provided. Using all the
#> genes as background.
#> No background list provided. Using all the
#> genes as background.
#> No background list provided. Using all the
#> genes as background.PCE for cluster 2 markers
runpce(markers.filtered.names.2, color_list[2])
#> No background list provided. Using all the
#> genes as background.
#> No background list provided. Using all the
#> genes as background.
#> No background list provided. Using all the
#> genes as background.PCE for cluster 3 markers
runpce(markers.filtered.names.3, color_list[3])
#> No background list provided. Using all the
#> genes as background.
#> No background list provided. Using all the
#> genes as background.
#> No background list provided. Using all the
#> genes as background.PCE for cluster 4 markers
runpce(markers.filtered.names.4, color_list[4])
#> No background list provided. Using all the
#> genes as background.
#> No background list provided. Using all the
#> genes as background.
#> No background list provided. Using all the
#> genes as background.PCE for cluster 5 markers
runpce(markers.filtered.names.5, color_list[5])
#> No background list provided. Using all the
#> genes as background.
#> No background list provided. Using all the
#> genes as background.
#> No background list provided. Using all the
#> genes as background.PCE for cluster 6 markers
runpce(markers.filtered.names.6, color_list[6])
#> No background list provided. Using all the
#> genes as background.
#> No background list provided. Using all the
#> genes as background.
#> No background list provided. Using all the
#> genes as background.PCE for cluster 7 markers
runpce(markers.filtered.names.7, color_list[7])
#> No background list provided. Using all the
#> genes as background.
#> No background list provided. Using all the
#> genes as background.
#> No background list provided. Using all the
#> genes as background.PCE for cluster 8 markers
runpce(markers.filtered.names.8, color_list[8])
#> No background list provided. Using all the
#> genes as background.
#> No background list provided. Using all the
#> genes as background.
#> No background list provided. Using all the
#> genes as background.PCE for cluster 9 markers
runpce(markers.filtered.names.9, color_list[9])
#> No background list provided. Using all the
#> genes as background.
#> No background list provided. Using all the
#> genes as background.
#> No background list provided. Using all the
#> genes as background.runpce(markers.filtered.names.10, color_list[10])
#> No background list provided. Using all the
#> genes as background.
#> No background list provided. Using all the
#> genes as background.
#> No background list provided. Using all the
#> genes as background.runpce(markers.filtered.names.11, color_list[11])
#> No background list provided. Using all the
#> genes as background.
#> No background list provided. Using all the
#> genes as background.
#> No background list provided. Using all the
#> genes as background.Inspecting STB clusters (cluster 4 and 8)
# findmarkers
markers.all.4vs8 <- FindMarkers(bapd8.integrated, ident.1 = 4, ident.2 = 8)
markers.all.8vs4 <- FindMarkers(bapd8.integrated, ident.1 = 8, ident.2 = 4)
#convert FC
markers.all.4vs8$avg_FC <- 2^markers.all.4vs8$avg_log2FC
markers.all.8vs4$avg_FC <- 2^markers.all.8vs4$avg_log2FC
#Filter
markers.filtered.4vs8 <- markers.all.4vs8 %>% filter(avg_FC >= 1.5) %>% filter(p_val_adj <= 0.05)
markers.filtered.8vs4 <- markers.all.8vs4 %>% filter(avg_FC >= 1.5) %>% filter(p_val_adj <= 0.05)
#list
markers.filtered.names.4vs8 <- rownames(markers.filtered.4vs8)
markers.filtered.names.8vs4 <- rownames(markers.filtered.8vs4)PCE for cluster 4 markers (vs. 8)
runpce(markers.filtered.names.4vs8, color_list[4])
#> No background list provided. Using all the
#> genes as background.
#> No background list provided. Using all the
#> genes as background.
#> No background list provided. Using all the
#> genes as background.PCE for cluster 8 markers (vs. 4)
runpce(markers.filtered.names.4vs8, color_list[5])
#> No background list provided. Using all the
#> genes as background.
#> No background list provided. Using all the
#> genes as background.
#> No background list provided. Using all the
#> genes as background.Plotting functions
Some plotting functions that we used for finalizing violin plots.
# https://divingintogeneticsandgenomics.rbind.io/post/stacked-violin-plot-for-visualizing-single-cell-data-in-seurat/
modify_vlnplot<- function(obj,
feature,
pt.size = 0,
plot.margin = unit(c(-0.75, 0, -0.75, 0), "cm"),
...) {
p<- VlnPlot(obj, features = feature, pt.size = pt.size, ... ) +
xlab("") + ylab(feature) + ggtitle("") +
theme(legend.position = "none",
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title.y = element_text(size = rel(1), angle = 0),
axis.text.y = element_text(size = rel(1)),
plot.margin = plot.margin )
return(p)
}
extract_max<- function(p){
ymax<- max(ggplot_build(p)$layout$panel_scales_y[[1]]$range$range)
return(ceiling(ymax))
}
StackedVlnPlot<- function(obj, features,
pt.size = 0,
plot.margin = unit(c(-0.75, 0, -0.75, 0), "cm"),
...) {
plot_list<- purrr::map(features, function(x) modify_vlnplot(obj = obj,feature = x, ...))
# Add back x-axis title to bottom plot. patchwork is going to support this?
plot_list[[length(plot_list)]]<- plot_list[[length(plot_list)]] +
theme(axis.text.x=element_text(), axis.ticks.x = element_line())
# change the y-axis tick to only max value
ymaxs<- purrr::map_dbl(plot_list, extract_max)
plot_list<- purrr::map2(plot_list, ymaxs, function(x,y) x +
scale_y_continuous(breaks = c(y)) +
expand_limits(y = y))
p<- patchwork::wrap_plots(plotlist = plot_list, ncol = 1)
return(p)
}Find all Markers
Find all markers using the in build function of Seurat
bapd8.markers <- FindAllMarkers(bapd8.integrated, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
#> Calculating cluster 1
#> Calculating cluster 2
#> Calculating cluster 3
#> Calculating cluster 4
#> Calculating cluster 5
#> Calculating cluster 6
#> Calculating cluster 7
#> Calculating cluster 8
#> Calculating cluster 9
#> Calculating cluster 10
#> Calculating cluster 11
bapd8.markers.ranked.10.percluster <- bapd8.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC)
datatable(bapd8.markers.ranked.10.percluster, rownames = FALSE, filter="top", options = list(pageLength = 10, scrollX=T))Find Conserved markers
This is optional. We did not use the marker genes specific for clusters of each condition.
lhs.f <- paste("markers.conserved.", 1:num.clusters, sep="")
rhs.f <- paste("FindConservedMarkers(bapd8.integrated, ident.1 = ",1:num.clusters,', grouping.var = "replicate", verbose = FALSE)', sep="")
commands.f <- paste(paste(lhs.f, rhs.f, sep="<-"), collapse=";")
eval(parse(text=commands.f))Expression tables
To export average expression levels for all genes, summarized based on all cells, each condition and each replicate, we used AverageExpression command for Seurat.
cluster.averages.data <- AverageExpression(bapd8.integrated, slot = "data", assays = "RNA")
condition.averages.data <- AverageExpression(bapd8.integrated, slot = "data", group.by = "orig.ident", assays = "RNA")
replicate.averages.data <- AverageExpression(bapd8.integrated, slot = "data", group.by = "replicate", assays = "RNA")
avg.data <- cluster.averages.data[["RNA"]]
avg.condition <- condition.averages.data[["RNA"]]
avg.replicate <- replicate.averages.data[["RNA"]]
write.table(avg.data, file="snn-average-data.tsv", sep= "\t")
write.table(avg.condition, file="snn-average-condition.tsv", sep= "\t")
write.table(avg.replicate, file="snn-average-replicate.tsv", sep= "\t")
#avg.data$gene <- row.names(avg.data)
#avg.data <- as_tibble(avg.data)
#head(avg.data)
#avg.data <- avg.data[, c(12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)]
#colnames(avg.data) <- c("Gene", paste("Cluster", colnames(avg.data[2:12]), sep = "_"))
#datatable(avg.data, rownames = FALSE, filter="top", options = list(pageLength = 10, scrollX=T))Cell numbers per clusters
Generate a summary table showing the number of cells in each cluster, broken down by replicates and treatment.
cells <- bapd8.integrated@meta.data %>%
dplyr::group_by(orig.ident, new_clusters, replicate) %>%
dplyr::summarise(length(new_clusters)) %>%
dplyr::rename(sample = orig.ident,
cluster = new_clusters,
condition = replicate,
number.of.cells = `length(new_clusters)`)
#> `summarise()` has grouped output by 'orig.ident', 'new_clusters'. You can
#> override using the `.groups` argument.
#head(cells)
#datatable(cells, rownames = TRUE, filter="top", options = list(pageLength = 10, scrollX=T))
#cells %>%
# group_by(orig.ident, new_clusters ) %>%
# summarize(`length(new_clusters)`)
ggplot(cells, aes(x = cluster, y = number.of.cells, fill = cluster )) +
geom_col() +
facet_wrap(~condition) +
theme_classic() +
theme(legend.position = "none")DE between conditions and Volcano plots
The DE was carried out between the conditions for each cluster. First we modified the metadata to create a separate column that has both the condition as well as the cluster number and then we used Seurat’s FindMarkers to find the genes that are differentially expressed. The genes that have log2FC > 1 or <-1 are shown in color if they also have p-val <0.05.
head(bapd8.integrated@meta.data)
#> orig.ident nCount_RNA nFeature_RNA replicate
#> AAACGAAGTGAGACGT-5pcO2_r1 5pcO2_r1 921 687 5pcO2
#> AAACGCTAGCCGATAG-5pcO2_r1 5pcO2_r1 1786 1053 5pcO2
#> AAAGAACAGGCACTAG-5pcO2_r1 5pcO2_r1 1091 573 5pcO2
#> AAAGAACTCATTTCCA-5pcO2_r1 5pcO2_r1 21054 4986 5pcO2
#> AAAGGATTCCGTAGTA-5pcO2_r1 5pcO2_r1 9718 3540 5pcO2
#> AAAGGTAGTAGGGAGG-5pcO2_r1 5pcO2_r1 18482 5317 5pcO2
#> integrated_snn_res.0.5 seurat_clusters new_clusters
#> AAACGAAGTGAGACGT-5pcO2_r1 2 2 3
#> AAACGCTAGCCGATAG-5pcO2_r1 0 0 1
#> AAAGAACAGGCACTAG-5pcO2_r1 2 2 3
#> AAAGAACTCATTTCCA-5pcO2_r1 4 4 5
#> AAAGGATTCCGTAGTA-5pcO2_r1 8 8 9
#> AAAGGTAGTAGGGAGG-5pcO2_r1 2 2 3
df <- bapd8.integrated@meta.data
df$stim <- (paste(df$replicate,df$new_clusters, sep = "."))
df$stim <- gsub('5pcO2', 'FIVE', df$stim)
df$stim <- gsub('20pcO2', 'TWENTY', df$stim)
bapd8.integrated@meta.data <- df
Idents(bapd8.integrated) <- "stim"
lhs.g <- paste("clus", 1:num.clusters, ".five.twenty", sep="")
rhs.g <- paste('FindMarkers(bapd8.integrated, ident.1 = "FIVE.', 1:num.clusters, '", ident.2 = "TWENTY.', 1:num.clusters, '", verbose = FALSE, logfc.threshold = 0)', sep="")
commands.g <- paste(paste(lhs.g, rhs.g, sep="<-"), collapse=";")
eval(parse(text=commands.g))
# lhs.h <- paste("clus", 1:num.clusters, ".five.twenty$log2fc", sep="")
# rhs.h <- paste('log2(exp(clus', 1:num.clusters, '.five.twenty$avg_logFC))', sep="")
# commands.h <- paste(paste(lhs.h, rhs.h, sep="<-"), collapse=";")
# eval(parse(text=commands.h))
lhs.j <- paste("clus", 1:num.clusters, ".five.twenty$Gene", sep="")
rhs.j <- paste('row.names(clus', 1:num.clusters, '.five.twenty)', sep="")
commands.j <- paste(paste(lhs.j, rhs.j, sep="<-"), collapse=";")
eval(parse(text=commands.j))
lhs.k <- paste("clus", 1:num.clusters, '.five.twenty$diffexpressed <- "other genes"', sep="")
commands.k <- paste(lhs.k , sep=";")
eval(parse(text=commands.k))
lhs.l <- paste('clus', 1:num.clusters,'.five.twenty$diffexpressed[clus',1:num.clusters,'.five.twenty$avg_log2FC >= 0.584962501 & clus', 1:num.clusters,'.five.twenty$p_val_adj <= 0.05] <- "up in 5% O2"', sep="")
commands.l <- paste(lhs.l , sep=";")
eval(parse(text=commands.l))
lhs.m <- paste('clus', 1:num.clusters,'.five.twenty$diffexpressed[clus',1:num.clusters,'.five.twenty$avg_log2FC <= -0.584962501 & clus', 1:num.clusters,'.five.twenty$p_val_adj <= 0.05] <- "up in 20% O2"', sep="")
commands.m <- paste(lhs.m , sep=";")
eval(parse(text=commands.m))
lhs.n <- paste('clus', 1:num.clusters,'.five.twenty$delabel <- ""', sep="")
commands.n <- paste(lhs.n , sep=";")
eval(parse(text=commands.n))
lhs.o <- paste('clus', 1:num.clusters,'.five.twenty$delabel[clus', 1:num.clusters,'.five.twenty$avg_log2FC >= 0.584962501 & clus', 1:num.clusters,'.five.twenty$p_val_adj <= 0.05]', sep="")
rhs.o <- paste('clus', 1:num.clusters,'.five.twenty$Gene[clus', 1:num.clusters,'.five.twenty$avg_log2FC >= 0.584962501 & clus', 1:num.clusters,'.five.twenty$p_val_adj <= 0.05]', sep="")
commands.o <- paste(paste(lhs.o, rhs.o, sep="<-"), collapse=";")
eval(parse(text=commands.o))
lhs.p <- paste('clus', 1:num.clusters,'.five.twenty$delabel[clus', 1:num.clusters,'.five.twenty$avg_log2FC <= -0.584962501 & clus', 1:num.clusters,'.five.twenty$p_val_adj <= 0.05]', sep="")
rhs.p <- paste('clus', 1:num.clusters,'.five.twenty$Gene[clus', 1:num.clusters,'.five.twenty$avg_log2FC <= -0.584962501 & clus', 1:num.clusters,'.five.twenty$p_val_adj <= 0.05]', sep="")
commands.p <- paste(paste(lhs.p, rhs.p, sep="<-"), collapse=";")
eval(parse(text=commands.p))Volcano Plot function: This plots allows us to visualize the genes that are overexpressed in each condition along with its p-value. First, we will setup a function to make a volcano plot and then we call them for each cluster and depict them as an interactive plot.
myVolcanoPlot <- function(mydf, clus.number) {
de <- ggplot(data=mydf, aes(x=avg_log2FC, y=-log10(p_val), col=diffexpressed, label=delabel)) +
geom_point(alpha = 0.5) +
theme_classic() +
scale_color_manual(name = "Expression", values=c("#4d4d4d", "#ca0020", "#0571b0")) +
ggtitle(paste("Cluster ", clus.number, ": 20% O2 vs. 5% O2")) +
xlab("Log2 fold change") +
ylab("-log10 pvalue (adjusted)") +
theme(legend.text.align = 0)
return(de)
}Volcano plot for cluster 1
ggplotly(myVolcanoPlot(clus1.five.twenty, 1))Fig.6-1: Interactive Volcano plot showing genes upregulated in 5% and 20% oxygen conditions for cluster 1
Volcano plot for cluster 2
ggplotly(myVolcanoPlot(clus2.five.twenty, 2))Fig.6-2: Interactive Volcano plot showing genes upregulated in 5% and 20% oxygen conditions for cluster 2
ggsave("Figure_6_A.svg", dpi=900, width = 8, height = 6)Volcano plot for cluster 3
ggplotly(myVolcanoPlot(clus3.five.twenty, 3))Fig.6-3: Interactive Volcano plot showing genes upregulated in 5% and 20% oxygen conditions for cluster 3
Volcano plot for cluster 4
ggplotly(myVolcanoPlot(clus4.five.twenty, 4))Fig.6-4: Interactive Volcano plot showing genes upregulated in 5% and 20% oxygen conditions for cluster 4
Volcano plot for cluster 5
ggplotly(myVolcanoPlot(clus5.five.twenty, 5))Fig.6-5: Interactive Volcano plot showing genes upregulated in 5% and 20% oxygen conditions for cluster 5
Volcano plot for cluster 6
ggplotly(myVolcanoPlot(clus6.five.twenty, 6))Fig.6-6: Interactive Volcano plot showing genes upregulated in 5% and 20% oxygen conditions for cluster 6
Volcano plot for cluster 7
ggplotly(myVolcanoPlot(clus7.five.twenty, 7))Fig.6-7: Interactive Volcano plot showing genes upregulated in 5% and 20% oxygen conditions for cluster 7
Volcano plot for cluster 8
ggplotly(myVolcanoPlot(clus8.five.twenty, 8))Fig.6-8: Interactive Volcano plot showing genes upregulated in 5% and 20% oxygen conditions for cluster 8
Volcano plot for cluster 9
ggplotly(myVolcanoPlot(clus9.five.twenty, 9))Fig.6-9: Interactive Volcano plot showing genes upregulated in 5% and 20% oxygen conditions for cluster 9
Volcano plot for cluster 10
ggplotly(myVolcanoPlot(clus10.five.twenty, 10))Fig.6-10: Interactive Volcano plot showing genes upregulated in 5% and 20% oxygen conditions for cluster 10
Volcano plot for cluster 11
ggplotly(myVolcanoPlot(clus11.five.twenty, 11))Fig.6-11: Interactive Volcano plot showing genes upregulated in 5% and 20% oxygen conditions for cluster 11
Figures from the publication
Prepare dataset for individual plots
DefaultAssay(bapd8.integrated) <- "RNA"
Idents(bapd8.integrated) <- "new_clusters"
cluster4n8 <- subset(bapd8.integrated, idents = c("4", "8"))
#cluster2356 <- subset(bapd8.integrated, idents = c("1", "3", "4", "8"))
#cluster2n3 <- subset(bapd8.integrated, idents = c("2", "3"))
Idents(cluster4n8) <- "new_clusters"
#Idents(cluster2356) <- "new_clusters"
#Idents(cluster2n3) <- "new_clusters"
#DefaultAssay(cluster2356) <- "RNA"
DefaultAssay(cluster4n8) <- "RNA"
#DefaultAssay(cluster2n3) <- "RNA"Figure 3
Transcription factors
fig3a <- c("GATA3", "TFAP2A")
multi_dittoPlot(bapd8.integrated, vars = fig3a, group.by = "new_clusters",
vlnplot.lineweight = 0.2, jitter.size = 0.3, ncol = 2, color.panel = color_list)Fig3A: Violin plots showing expression (average log fold change) for genes encoding transcription factors
Structural proteins
fig3b <- c("KRT7", "KRT23")
multi_dittoPlot(bapd8.integrated, vars = fig3b, group.by = "new_clusters",
vlnplot.lineweight = 0.2, jitter.size = 0.3, ncol = 2, color.panel = color_list)Fig3B: Violin plots showing expression (average log fold change) for genes encoding Structural proteins
Hormones
fig3c <- c("CGA", "PGF")
multi_dittoPlot(bapd8.integrated, vars = fig3c, group.by = "new_clusters",
vlnplot.lineweight = 0.2, jitter.size = 0.3, ncol = 2, color.panel = color_list)Fig3C: Violin plots showing expression (average log fold change) for genes encoding Hormones
Transporters and Carcinoembryonic Antigen
fig3d <- c("SLC40A1", "XAGE2")
multi_dittoPlot(bapd8.integrated, vars = fig3d, group.by = "new_clusters",
vlnplot.lineweight = 0.2, jitter.size = 0.3, ncol = 2, color.panel = color_list)Fig3D: Violin plots showing expression (average log fold change) for genes encoding Transporters and Carcinoembryonic Antigen
Enzymes
fig3e <- c("CYP11A1", "HSD3B1")
multi_dittoPlot(bapd8.integrated, vars = fig3e, group.by = "new_clusters",
vlnplot.lineweight = 0.2, jitter.size = 0.3, ncol = 2, color.panel = color_list)Fig3E: Violin plots showing expression (average log fold change) for genes encoding enzymes
Long non-coding RNAs
fig3f <- c("MALAT1", "NEAT1")
multi_dittoPlot(bapd8.integrated, vars = fig3f, group.by = "new_clusters",
vlnplot.lineweight = 0.2, jitter.size = 0.3, ncol = 2, color.panel = color_list)Fig3F: Violin plots showing expression (average log fold change) for genes encoding lncRNAs
Figure 5
inputGenes<-toupper(markers.filtered.names.4)
humanGene<-humanGeneMapping[humanGeneMapping$Gene.name %in% inputGenes,]
inputGenes<-humanGene$Gene
expressionData<-data[intersect(row.names(data),humanGeneMapping$Gene),]
se<-SummarizedExperiment(assays = SimpleList(as.matrix(expressionData)),rowData = row.names(expressionData),colData = colnames(expressionData))
cellSpecificGenesExp<-teGeneRetrieval(se,expressedGeneThreshold = 1)
se<-SummarizedExperiment(assays = SimpleList(as.matrix(expressionData)),rowData = row.names(expressionData),colData = colnames(expressionData))
cellSpecificGenesExp<-teGeneRetrieval(se,expressedGeneThreshold = 1)
print(length(inputGenes))
#> [1] 606
gs<-GeneSet(geneIds=toupper(inputGenes))
output2<-teEnrichmentCustom(gs,cellSpecificGenesExp)
#> No background list provided. Using all the
#> genes as background.
enrichmentOutput<-setNames(data.frame(assay(output2[[1]]),row.names = rowData(output2[[1]])[,1]),colData(output2[[1]])[,1])
row.names(cellDetails)<-cellDetails$RName
enrichmentOutput$Tissue<- cellDetails[row.names(enrichmentOutput),"CellName"]
sct.cluster4.genes <- as.data.frame(assay(output2[[2]][["SCT"]]))[1]
sct.cluster4.genes <- humanGeneMapping[humanGeneMapping$Gene %in% as.list(sct.cluster4.genes$Gene),]
sct.cluster4.genes <- sct.cluster4.genes$Gene.name
# PCE cluster 6
inputGenes<-toupper(markers.filtered.names.8)
humanGene<-humanGeneMapping[humanGeneMapping$Gene.name %in% inputGenes,]
inputGenes<-humanGene$Gene
expressionData<-data[intersect(row.names(data),humanGeneMapping$Gene),]
se<-SummarizedExperiment(assays = SimpleList(as.matrix(expressionData)),rowData = row.names(expressionData),colData = colnames(expressionData))
cellSpecificGenesExp<-teGeneRetrieval(se,expressedGeneThreshold = 1)
se<-SummarizedExperiment(assays = SimpleList(as.matrix(expressionData)),rowData = row.names(expressionData),colData = colnames(expressionData))
cellSpecificGenesExp<-teGeneRetrieval(se,expressedGeneThreshold = 1)
print(length(inputGenes))
#> [1] 437
gs<-GeneSet(geneIds=toupper(inputGenes))
output2<-teEnrichmentCustom(gs,cellSpecificGenesExp)
#> No background list provided. Using all the
#> genes as background.
enrichmentOutput<-setNames(data.frame(assay(output2[[1]]),row.names = rowData(output2[[1]])[,1]),colData(output2[[1]])[,1])
row.names(cellDetails)<-cellDetails$RName
enrichmentOutput$Tissue<- cellDetails[row.names(enrichmentOutput),"CellName"]
sct.cluster8.genes <- as.data.frame(assay(output2[[2]][["SCT"]]))[1]
sct.cluster8.genes <- humanGeneMapping[humanGeneMapping$Gene %in% as.list(sct.cluster8.genes$Gene),]
sct.cluster8.genes <- sct.cluster8.genes$Gene.name
x = list(sct.cluster4.genes, sct.cluster8.genes)names(x) <- c("STB genes of Cluster 4","STB genes of Cluster 8")
ggvenn(
x,
fill_color = color_list[4:8],
stroke_size = NA,
set_name_size = 4,
show_percentage = FALSE
)
#> Warning in sprintf("%d", n, 100 * n/sum(n)): one argument not used by format
#> '%d'Fig5A: STB specific genes shared by clusters 5 and 6
fig5a <- c("KRT8", "S100P", "XAGE2")
fig5b <- c("ERVV-1", "TBX3", "GRHL1")
multi_dittoPlot(cluster4n8, vars = fig5a, group.by = "new_clusters",
vlnplot.lineweight = 0.2, jitter.size = 0.3, ncol = 1, color.panel = c(color_list[5:6]))
multi_dittoPlot(cluster4n8, vars = fig5b, group.by = "new_clusters",
vlnplot.lineweight = 0.2, jitter.size = 0.3, ncol = 1, color.panel = c(color_list[5:6]))Fig5: STB specific genes shared by clusters 5 and 6. Some highly expressed STB specific genes show (A) higher expression in cluster 5 and (B) higher expression in cluster 6
Save RDS file
Finally we will save the entire session data to an external file. This can be explored again by loading it in R in the future if there is any need.
saveRDS(bapd8.integrated, 'bapd8.integrated.rds')Session Info
Complete session information
sessionInfo()
#> R version 4.1.0 (2021-05-18)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Ubuntu 20.04.2 LTS
#>
#> Matrix products: default
#> BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=C
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> attached base packages:
#> [1] grid stats4 stats graphics grDevices utils datasets
#> [8] methods base
#>
#> other attached packages:
#> [1] SeuratWrappers_0.3.0 enrichR_3.0
#> [3] ape_5.6-1 DT_0.20
#> [5] plotly_4.10.0 ggvenn_0.1.9
#> [7] scales_1.1.1 ComplexHeatmap_2.10.0
#> [9] dittoSeq_1.6.0 ggrepel_0.9.1
#> [11] calibrate_1.7.7 MASS_7.3-55
#> [13] enhancedDimPlot_0.0.0.9100 forcats_0.5.1
#> [15] purrr_0.3.4 readr_2.1.2
#> [17] tidyr_1.2.0 tibble_3.1.6
#> [19] tidyverse_1.3.1 gprofiler2_0.2.1
#> [21] TissueEnrich_1.14.0 GSEABase_1.56.0
#> [23] graph_1.72.0 annotate_1.72.0
#> [25] XML_3.99-0.8 AnnotationDbi_1.56.2
#> [27] SummarizedExperiment_1.24.0 GenomicRanges_1.46.1
#> [29] GenomeInfoDb_1.30.1 IRanges_2.28.0
#> [31] S4Vectors_0.32.3 MatrixGenerics_1.6.0
#> [33] matrixStats_0.61.0 ensurer_1.1
#> [35] stringr_1.4.0 dplyr_1.0.8
#> [37] gridExtra_2.3 multtest_2.50.0
#> [39] Biobase_2.54.0 BiocGenerics_0.40.0
#> [41] metap_1.8 patchwork_1.1.1
#> [43] cowplot_1.1.1 ggplot2_3.3.5
#> [45] kableExtra_1.3.4 knitr_1.37
#> [47] SeuratObject_4.0.4 Seurat_4.1.0
#>
#> loaded via a namespace (and not attached):
#> [1] SnowballC_0.7.0 scattermore_0.8
#> [3] R.methodsS3_1.8.1 ragg_1.2.1
#> [5] bit64_4.0.5 R.utils_2.11.0
#> [7] irlba_2.3.5 multcomp_1.4-18
#> [9] DelayedArray_0.20.0 data.table_1.14.2
#> [11] rpart_4.1.16 KEGGREST_1.34.0
#> [13] RCurl_1.98-1.6 doParallel_1.0.17
#> [15] generics_0.1.2 TH.data_1.1-0
#> [17] RSQLite_2.2.9 RANN_2.6.1
#> [19] future_1.23.0 tokenizers_0.2.1
#> [21] bit_4.0.4 tzdb_0.2.0
#> [23] mutoss_0.1-12 spatstat.data_2.1-2
#> [25] webshot_0.5.2 xml2_1.3.3
#> [27] lubridate_1.8.0 httpuv_1.6.5
#> [29] assertthat_0.2.1 xfun_0.29
#> [31] hms_1.1.1 jquerylib_0.1.4
#> [33] evaluate_0.14 promises_1.2.0.1
#> [35] fansi_1.0.2 dbplyr_2.1.1
#> [37] readxl_1.3.1 igraph_1.2.11
#> [39] DBI_1.1.2 tmvnsim_1.0-2
#> [41] htmlwidgets_1.5.4 spatstat.geom_2.3-2
#> [43] paletteer_1.4.0 ellipsis_0.3.2
#> [45] RSpectra_0.16-0 crosstalk_1.2.0
#> [47] backports_1.4.1 bookdown_0.24
#> [49] deldir_1.0-6 vctrs_0.3.8
#> [51] SingleCellExperiment_1.16.0 Cairo_1.5-14
#> [53] remotes_2.4.2 ROCR_1.0-11
#> [55] abind_1.4-5 cachem_1.0.6
#> [57] withr_2.4.3 sctransform_0.3.3
#> [59] goftest_1.2-3 mnormt_2.0.2
#> [61] svglite_2.1.0 cluster_2.1.2
#> [63] lazyeval_0.2.2 crayon_1.5.0
#> [65] labeling_0.4.2 pkgconfig_2.0.3
#> [67] qqconf_1.1.1 vipor_0.4.5
#> [69] nlme_3.1-155 rlang_1.0.1
#> [71] globals_0.14.0 lifecycle_1.0.1
#> [73] miniUI_0.1.1.1 sandwich_3.0-1
#> [75] mathjaxr_1.4-0 rsvd_1.0.5
#> [77] modelr_0.1.8 tidytext_0.3.2
#> [79] ggrastr_1.0.1 cellranger_1.1.0
#> [81] polyclip_1.10-0 lmtest_0.9-39
#> [83] Matrix_1.4-0 zoo_1.8-9
#> [85] beeswarm_0.4.0 reprex_2.0.1
#> [87] ggridges_0.5.3 GlobalOptions_0.1.2
#> [89] pheatmap_1.0.12 png_0.1-7
#> [91] viridisLite_0.4.0 rjson_0.2.21
#> [93] bitops_1.0-7 R.oo_1.24.0
#> [95] KernSmooth_2.23-20 Biostrings_2.62.0
#> [97] blob_1.2.2 shape_1.4.6
#> [99] parallelly_1.30.0 spatstat.random_2.1-0
#> [101] memoise_2.0.1 magrittr_2.0.2
#> [103] plyr_1.8.6 ica_1.0-2
#> [105] zlibbioc_1.40.0 compiler_4.1.0
#> [107] RColorBrewer_1.1-2 clue_0.3-60
#> [109] plotrix_3.8-2 fitdistrplus_1.1-6
#> [111] cli_3.2.0 XVector_0.34.0
#> [113] listenv_0.8.0 janeaustenr_0.1.5
#> [115] pbapply_1.5-0 mgcv_1.8-38
#> [117] tidyselect_1.1.1 stringi_1.7.6
#> [119] textshaping_0.3.6 highr_0.9
#> [121] yaml_2.2.2 sass_0.4.0
#> [123] tools_4.1.0 future.apply_1.8.1
#> [125] parallel_4.1.0 circlize_0.4.14
#> [127] rstudioapi_0.13 foreach_1.5.2
#> [129] farver_2.1.0 rmdformats_1.0.3
#> [131] Rtsne_0.15 BiocManager_1.30.16
#> [133] digest_0.6.29 shiny_1.7.1
#> [135] Rcpp_1.0.8 broom_0.7.12
#> [137] later_1.3.0 RcppAnnoy_0.0.19
#> [139] httr_1.4.2 Rdpack_2.1.3
#> [141] colorspace_2.0-2 rvest_1.0.2
#> [143] fs_1.5.2 tensor_1.5
#> [145] reticulate_1.24 splines_4.1.0
#> [147] uwot_0.1.11 rematch2_2.1.2
#> [149] sn_2.0.1 spatstat.utils_2.3-0
#> [151] systemfonts_1.0.4 xtable_1.8-4
#> [153] jsonlite_1.7.3 R6_2.5.1
#> [155] TFisher_0.2.0 pillar_1.7.0
#> [157] htmltools_0.5.2 mime_0.12
#> [159] glue_1.6.1 fastmap_1.1.0
#> [161] codetools_0.2-18 mvtnorm_1.1-3
#> [163] utf8_1.2.2 lattice_0.20-45
#> [165] bslib_0.3.1 spatstat.sparse_2.1-0
#> [167] numDeriv_2016.8-1.1 ggbeeswarm_0.6.0
#> [169] curl_4.3.2 leiden_0.3.9
#> [171] limma_3.50.1 survival_3.2-13
#> [173] rmarkdown_2.11 munsell_0.5.0
#> [175] GetoptLong_1.0.5 GenomeInfoDbData_1.2.7
#> [177] iterators_1.0.14 haven_2.4.3
#> [179] reshape2_1.4.4 gtable_0.3.0
#> [181] rbibutils_2.2.7 spatstat.core_2.4-0